## Rows: 11,314
## Columns: 106
## $ id <dbl> 10595, 10990, 10993, 109…
## $ listing_url <chr> "https://www.airbnb.com/…
## $ scrape_id <dbl> 2.02e+13, 2.02e+13, 2.02…
## $ last_scraped <date> 2020-06-17, 2020-06-17,…
## $ name <chr> "96m2, 3BR, 2BA, Metro, …
## $ summary <chr> "Athens Furnished Apartm…
## $ space <chr> "Athens Furnished Apartm…
## $ description <chr> "Athens Furnished Apartm…
## $ experiences_offered <chr> "none", "none", "none", …
## $ neighborhood_overview <chr> "Ampelokipi district is …
## $ notes <chr> "Although is very easy t…
## $ transit <chr> "Note: 5-day ticket for …
## $ access <chr> "Guest have access to al…
## $ interaction <chr> "-Our reception is 10 me…
## $ house_rules <chr> "- Parties, meetings, ap…
## $ thumbnail_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ medium_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ picture_url <chr> "https://a0.muscache.com…
## $ xl_picture_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ host_id <dbl> 37177, 37177, 37177, 371…
## $ host_url <chr> "https://www.airbnb.com/…
## $ host_name <chr> "Emmanouil", "Emmanouil"…
## $ host_since <date> 2009-09-08, 2009-09-08,…
## $ host_location <chr> "Athens, Attica, Greece"…
## $ host_about <chr> "Athens Quality Apartmen…
## $ host_response_time <chr> "within an hour", "withi…
## $ host_response_rate <chr> "100%", "100%", "100%", …
## $ host_acceptance_rate <chr> "100%", "100%", "100%", …
## $ host_is_superhost <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ host_thumbnail_url <chr> "https://a0.muscache.com…
## $ host_picture_url <chr> "https://a0.muscache.com…
## $ host_neighbourhood <chr> "Ambelokipi", "Ambelokip…
## $ host_listings_count <dbl> 6, 6, 6, 6, 6, 2, 1, 2, …
## $ host_total_listings_count <dbl> 6, 6, 6, 6, 6, 2, 1, 2, …
## $ host_verifications <chr> "['email', 'phone', 'rev…
## $ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ host_identity_verified <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ street <chr> "Athens, Attica, Greece"…
## $ neighbourhood <chr> "Ambelokipi", "Ambelokip…
## $ neighbourhood_cleansed <chr> "ΑΜΠΕΛΟΚΗΠΟΙ", "ΑΜΠΕΛΟΚΗ…
## $ neighbourhood_group_cleansed <lgl> NA, NA, NA, NA, NA, NA, …
## $ city <chr> "Athens", "Athens", "Ath…
## $ state <chr> "Attica", "Attica", "Att…
## $ zipcode <chr> "11526", "11526", "115 2…
## $ market <chr> "Athens", "Athens", "Ath…
## $ smart_location <chr> "Athens, Greece", "Athen…
## $ country_code <chr> "GR", "GR", "GR", "GR", …
## $ country <chr> "Greece", "Greece", "Gre…
## $ latitude <dbl> 38, 38, 38, 38, 38, 38, …
## $ longitude <dbl> 23.8, 23.8, 23.8, 23.8, …
## $ is_location_exact <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ property_type <chr> "Apartment", "Apartment"…
## $ room_type <chr> "Entire home/apt", "Enti…
## $ accommodates <dbl> 8, 4, 2, 4, 4, 4, 1, 5, …
## $ bathrooms <dbl> 2.0, 1.0, 1.0, 1.0, 1.0,…
## $ bedrooms <dbl> 3, 1, 0, 1, 1, 1, 1, 2, …
## $ beds <dbl> 5, 1, 1, 2, 1, 2, 1, 2, …
## $ bed_type <chr> "Real Bed", "Real Bed", …
## $ amenities <chr> "{TV,\"Cable TV\",Intern…
## $ square_feet <dbl> 1076, NA, NA, NA, NA, 64…
## $ price <chr> "$122.00", "$45.00", "$3…
## $ weekly_price <chr> "$700.00", "$420.00", "$…
## $ monthly_price <chr> "$2,800.00", "$1,680.00"…
## $ security_deposit <chr> "$0.00", "$0.00", "$0.00…
## $ cleaning_fee <chr> "$25.00", "$15.00", "$10…
## $ guests_included <dbl> 4, 2, 2, 2, 2, 2, 1, 1, …
## $ extra_people <chr> "$13.00", "$5.00", "$0.0…
## $ minimum_nights <dbl> 1, 1, 1, 1, 1, 2, 1, 5, …
## $ maximum_nights <dbl> 45, 60, 60, 60, 30, 730,…
## $ minimum_minimum_nights <dbl> 1, 1, 1, 1, 1, 2, 1, 5, …
## $ maximum_minimum_nights <dbl> 4, 4, 4, 4, 4, 2, 1, 5, …
## $ minimum_maximum_nights <dbl> 45, 60, 60, 60, 30, 1125…
## $ maximum_maximum_nights <dbl> 45, 60, 60, 60, 30, 1125…
## $ minimum_nights_avg_ntm <dbl> 1.3, 1.4, 1.8, 1.5, 1.4,…
## $ maximum_nights_avg_ntm <dbl> 45, 60, 60, 60, 30, 1125…
## $ calendar_updated <chr> "2 weeks ago", "2 weeks …
## $ has_availability <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ availability_30 <dbl> 30, 13, 13, 17, 12, 30, …
## $ availability_60 <dbl> 60, 43, 43, 47, 42, 60, …
## $ availability_90 <dbl> 90, 73, 72, 77, 69, 83, …
## $ availability_365 <dbl> 365, 271, 347, 275, 262,…
## $ calendar_last_scraped <date> 2020-06-17, 2020-06-17,…
## $ number_of_reviews <dbl> 25, 34, 48, 21, 17, 454,…
## $ number_of_reviews_ltm <dbl> 5, 3, 1, 2, 1, 31, 0, 2,…
## $ first_review <date> 2011-05-20, 2012-09-06,…
## $ last_review <date> 2020-03-15, 2020-01-08,…
## $ review_scores_rating <dbl> 97, 98, 97, 96, 95, 96, …
## $ review_scores_accuracy <dbl> 10, 10, 10, 10, 10, 10, …
## $ review_scores_cleanliness <dbl> 10, 10, 10, 10, 10, 10, …
## $ review_scores_checkin <dbl> 10, 10, 10, 10, 10, 10, …
## $ review_scores_communication <dbl> 10, 10, 10, 10, 10, 10, …
## $ review_scores_location <dbl> 9, 10, 10, 9, 9, 10, NA,…
## $ review_scores_value <dbl> 10, 10, 10, 10, 9, 10, N…
## $ requires_license <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ license <chr> "478825", "400315", "478…
## $ jurisdiction_names <lgl> NA, NA, NA, NA, NA, NA, …
## $ instant_bookable <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ is_business_travel_ready <lgl> FALSE, FALSE, FALSE, FAL…
## $ cancellation_policy <chr> "moderate", "moderate", …
## $ require_guest_profile_picture <lgl> FALSE, FALSE, FALSE, FAL…
## $ require_guest_phone_verification <lgl> FALSE, FALSE, FALSE, FAL…
## $ calculated_host_listings_count <dbl> 6, 6, 6, 6, 6, 2, 1, 2, …
## $ calculated_host_listings_count_entire_homes <dbl> 6, 6, 6, 6, 6, 2, 0, 2, …
## $ calculated_host_listings_count_private_rooms <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ calculated_host_listings_count_shared_rooms <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reviews_per_month <dbl> 0.23, 0.36, 0.51, 0.17, …
| Name | athens_data |
| Number of rows | 11314 |
| Number of columns | 106 |
| _______________________ | |
| Column type frequency: | |
| character | 47 |
| Date | 5 |
| logical | 15 |
| numeric | 39 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| listing_url | 0 | 1.00 | 34 | 37 | 0 | 11314 | 0 |
| name | 7 | 1.00 | 1 | 98 | 0 | 11114 | 0 |
| summary | 314 | 0.97 | 1 | 1000 | 0 | 10140 | 0 |
| space | 2804 | 0.75 | 1 | 1000 | 0 | 7793 | 0 |
| description | 176 | 0.98 | 1 | 1000 | 0 | 10600 | 0 |
| experiences_offered | 0 | 1.00 | 4 | 4 | 0 | 1 | 0 |
| neighborhood_overview | 3139 | 0.72 | 1 | 1000 | 0 | 6611 | 0 |
| notes | 6750 | 0.40 | 1 | 1000 | 0 | 3465 | 0 |
| transit | 3403 | 0.70 | 1 | 1000 | 0 | 6470 | 0 |
| access | 5447 | 0.52 | 1 | 1000 | 0 | 4552 | 0 |
| interaction | 4427 | 0.61 | 1 | 1000 | 0 | 5010 | 0 |
| house_rules | 5782 | 0.49 | 2 | 1000 | 0 | 4155 | 0 |
| picture_url | 0 | 1.00 | 81 | 146 | 0 | 11201 | 0 |
| host_url | 0 | 1.00 | 39 | 43 | 0 | 6272 | 0 |
| host_name | 1 | 1.00 | 1 | 33 | 0 | 2650 | 0 |
| host_location | 17 | 1.00 | 2 | 94 | 0 | 575 | 0 |
| host_about | 4823 | 0.57 | 1 | 4636 | 0 | 3036 | 9 |
| host_response_time | 1 | 1.00 | 3 | 18 | 0 | 5 | 0 |
| host_response_rate | 1 | 1.00 | 2 | 4 | 0 | 47 | 0 |
| host_acceptance_rate | 1 | 1.00 | 2 | 4 | 0 | 76 | 0 |
| host_thumbnail_url | 1 | 1.00 | 55 | 106 | 0 | 6249 | 0 |
| host_picture_url | 1 | 1.00 | 57 | 109 | 0 | 6249 | 0 |
| host_neighbourhood | 1826 | 0.84 | 4 | 29 | 0 | 66 | 0 |
| host_verifications | 0 | 1.00 | 2 | 147 | 0 | 218 | 0 |
| street | 0 | 1.00 | 10 | 62 | 0 | 274 | 0 |
| neighbourhood | 1 | 1.00 | 4 | 17 | 0 | 32 | 0 |
| neighbourhood_cleansed | 0 | 1.00 | 4 | 32 | 0 | 45 | 0 |
| city | 4 | 1.00 | 2 | 30 | 0 | 94 | 0 |
| state | 10261 | 0.09 | 1 | 38 | 0 | 132 | 0 |
| zipcode | 262 | 0.98 | 5 | 12 | 0 | 212 | 0 |
| market | 126 | 0.99 | 6 | 21 | 0 | 2 | 0 |
| smart_location | 0 | 1.00 | 10 | 39 | 0 | 102 | 0 |
| country_code | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| country | 0 | 1.00 | 6 | 6 | 0 | 1 | 0 |
| property_type | 0 | 1.00 | 4 | 23 | 0 | 26 | 0 |
| room_type | 0 | 1.00 | 10 | 15 | 0 | 4 | 0 |
| bed_type | 0 | 1.00 | 5 | 13 | 0 | 5 | 0 |
| amenities | 0 | 1.00 | 2 | 1646 | 0 | 10540 | 0 |
| price | 0 | 1.00 | 5 | 9 | 0 | 279 | 0 |
| weekly_price | 10790 | 0.05 | 6 | 9 | 0 | 148 | 0 |
| monthly_price | 10817 | 0.04 | 7 | 10 | 0 | 156 | 0 |
| security_deposit | 2984 | 0.74 | 5 | 9 | 0 | 94 | 0 |
| cleaning_fee | 1852 | 0.84 | 5 | 7 | 0 | 85 | 0 |
| extra_people | 0 | 1.00 | 5 | 7 | 0 | 39 | 0 |
| calendar_updated | 0 | 1.00 | 5 | 13 | 0 | 71 | 0 |
| license | 3949 | 0.65 | 4 | 140 | 0 | 6735 | 0 |
| cancellation_policy | 0 | 1.00 | 8 | 27 | 0 | 6 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| last_scraped | 0 | 1.00 | 2020-06-16 | 2020-06-18 | 2020-06-16 | 3 |
| host_since | 1 | 1.00 | 2009-09-08 | 2020-06-11 | 2017-03-06 | 2423 |
| calendar_last_scraped | 0 | 1.00 | 2020-06-16 | 2020-06-18 | 2020-06-16 | 3 |
| first_review | 2677 | 0.76 | 2010-07-08 | 2020-06-17 | 2018-09-07 | 1954 |
| last_review | 2677 | 0.76 | 2013-05-23 | 2020-06-17 | 2020-01-29 | 985 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| thumbnail_url | 11314 | 0 | NaN | : |
| medium_url | 11314 | 0 | NaN | : |
| xl_picture_url | 11314 | 0 | NaN | : |
| host_is_superhost | 1 | 1 | 0.38 | FAL: 7034, TRU: 4279 |
| host_has_profile_pic | 1 | 1 | 1.00 | TRU: 11287, FAL: 26 |
| host_identity_verified | 1 | 1 | 0.19 | FAL: 9217, TRU: 2096 |
| neighbourhood_group_cleansed | 11314 | 0 | NaN | : |
| is_location_exact | 0 | 1 | 0.92 | TRU: 10370, FAL: 944 |
| has_availability | 0 | 1 | 1.00 | TRU: 11314 |
| requires_license | 0 | 1 | 1.00 | TRU: 11314 |
| jurisdiction_names | 11314 | 0 | NaN | : |
| instant_bookable | 0 | 1 | 0.75 | TRU: 8528, FAL: 2786 |
| is_business_travel_ready | 0 | 1 | 0.00 | FAL: 11314 |
| require_guest_profile_picture | 0 | 1 | 0.01 | FAL: 11247, TRU: 67 |
| require_guest_phone_verification | 0 | 1 | 0.01 | FAL: 11201, TRU: 113 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 2.74e+07 | 1.09e+07 | 1.06e+04 | 2.01e+07 | 2.90e+07 | 3.60e+07 | 4.38e+07 | ▂▃▆▇▇ |
| scrape_id | 0 | 1.00 | 2.02e+13 | 0.00e+00 | 2.02e+13 | 2.02e+13 | 2.02e+13 | 2.02e+13 | 2.02e+13 | ▁▁▇▁▁ |
| host_id | 0 | 1.00 | 1.29e+08 | 9.79e+07 | 3.72e+04 | 3.65e+07 | 1.19e+08 | 2.12e+08 | 3.50e+08 | ▇▅▃▅▂ |
| host_listings_count | 1 | 1.00 | 1.68e+01 | 5.52e+01 | 0.00e+00 | 1.00e+00 | 2.00e+00 | 9.00e+00 | 1.12e+03 | ▇▁▁▁▁ |
| host_total_listings_count | 1 | 1.00 | 1.68e+01 | 5.52e+01 | 0.00e+00 | 1.00e+00 | 2.00e+00 | 9.00e+00 | 1.12e+03 | ▇▁▁▁▁ |
| latitude | 0 | 1.00 | 3.80e+01 | 1.00e-02 | 3.80e+01 | 3.80e+01 | 3.80e+01 | 3.80e+01 | 3.80e+01 | ▃▇▆▂▁ |
| longitude | 0 | 1.00 | 2.37e+01 | 1.00e-02 | 2.37e+01 | 2.37e+01 | 2.37e+01 | 2.37e+01 | 2.38e+01 | ▁▇▇▂▁ |
| accommodates | 0 | 1.00 | 3.90e+00 | 2.04e+00 | 1.00e+00 | 2.00e+00 | 4.00e+00 | 5.00e+00 | 3.00e+01 | ▇▁▁▁▁ |
| bathrooms | 1 | 1.00 | 1.21e+00 | 5.10e-01 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 1.00e+00 | 1.20e+01 | ▇▁▁▁▁ |
| bedrooms | 10 | 1.00 | 1.40e+00 | 8.60e-01 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 1.00e+01 | ▇▁▁▁▁ |
| beds | 46 | 1.00 | 2.14e+00 | 1.52e+00 | 0.00e+00 | 1.00e+00 | 2.00e+00 | 3.00e+00 | 4.00e+01 | ▇▁▁▁▁ |
| square_feet | 11226 | 0.01 | 6.68e+02 | 5.47e+02 | 0.00e+00 | 2.69e+02 | 5.92e+02 | 1.08e+03 | 2.79e+03 | ▇▅▃▁▁ |
| guests_included | 0 | 1.00 | 1.92e+00 | 1.25e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 1.60e+01 | ▇▁▁▁▁ |
| minimum_nights | 0 | 1.00 | 4.26e+00 | 2.18e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 1.00e+03 | ▇▁▁▁▁ |
| maximum_nights | 0 | 1.00 | 1.63e+03 | 9.40e+04 | 1.00e+00 | 6.20e+01 | 1.12e+03 | 1.12e+03 | 1.00e+07 | ▇▁▁▁▁ |
| minimum_minimum_nights | 0 | 1.00 | 3.96e+00 | 1.89e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 1.00e+03 | ▇▁▁▁▁ |
| maximum_minimum_nights | 0 | 1.00 | 5.36e+00 | 3.31e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 1.00e+03 | ▇▁▁▁▁ |
| minimum_maximum_nights | 0 | 1.00 | 8.80e+02 | 4.54e+02 | 1.00e+00 | 1.00e+03 | 1.12e+03 | 1.12e+03 | 1.00e+04 | ▇▁▁▁▁ |
| maximum_maximum_nights | 0 | 1.00 | 8.90e+02 | 4.46e+02 | 1.00e+00 | 1.12e+03 | 1.12e+03 | 1.12e+03 | 1.00e+04 | ▇▁▁▁▁ |
| minimum_nights_avg_ntm | 0 | 1.00 | 4.37e+00 | 2.17e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 1.00e+03 | ▇▁▁▁▁ |
| maximum_nights_avg_ntm | 0 | 1.00 | 8.84e+02 | 4.48e+02 | 1.00e+00 | 1.03e+03 | 1.12e+03 | 1.12e+03 | 1.00e+04 | ▇▁▁▁▁ |
| availability_30 | 0 | 1.00 | 1.92e+01 | 1.22e+01 | 0.00e+00 | 4.00e+00 | 2.60e+01 | 3.00e+01 | 3.00e+01 | ▃▁▁▁▇ |
| availability_60 | 0 | 1.00 | 4.00e+01 | 2.35e+01 | 0.00e+00 | 1.80e+01 | 5.20e+01 | 5.90e+01 | 6.00e+01 | ▃▁▁▂▇ |
| availability_90 | 0 | 1.00 | 6.16e+01 | 3.45e+01 | 0.00e+00 | 4.10e+01 | 7.90e+01 | 8.90e+01 | 9.00e+01 | ▃▁▁▂▇ |
| availability_365 | 0 | 1.00 | 2.32e+02 | 1.33e+02 | 0.00e+00 | 1.21e+02 | 2.74e+02 | 3.58e+02 | 3.65e+02 | ▃▂▂▂▇ |
| number_of_reviews | 0 | 1.00 | 3.52e+01 | 6.12e+01 | 0.00e+00 | 1.00e+00 | 9.00e+00 | 4.20e+01 | 7.11e+02 | ▇▁▁▁▁ |
| number_of_reviews_ltm | 0 | 1.00 | 1.11e+01 | 1.71e+01 | 0.00e+00 | 0.00e+00 | 3.00e+00 | 1.60e+01 | 1.57e+02 | ▇▁▁▁▁ |
| review_scores_rating | 2739 | 0.76 | 9.53e+01 | 7.04e+00 | 2.00e+01 | 9.40e+01 | 9.70e+01 | 1.00e+02 | 1.00e+02 | ▁▁▁▁▇ |
| review_scores_accuracy | 2751 | 0.76 | 9.76e+00 | 6.80e-01 | 2.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_cleanliness | 2751 | 0.76 | 9.66e+00 | 7.40e-01 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_checkin | 2753 | 0.76 | 9.85e+00 | 5.50e-01 | 2.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_communication | 2752 | 0.76 | 9.84e+00 | 5.70e-01 | 2.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_location | 2753 | 0.76 | 9.57e+00 | 7.60e-01 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_value | 2754 | 0.76 | 9.62e+00 | 7.40e-01 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| calculated_host_listings_count | 0 | 1.00 | 9.98e+00 | 2.14e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 7.00e+00 | 1.38e+02 | ▇▁▁▁▁ |
| calculated_host_listings_count_entire_homes | 0 | 1.00 | 7.57e+00 | 1.52e+01 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 6.00e+00 | 8.50e+01 | ▇▁▁▁▁ |
| calculated_host_listings_count_private_rooms | 0 | 1.00 | 2.14e+00 | 1.27e+01 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 1.12e+02 | ▇▁▁▁▁ |
| calculated_host_listings_count_shared_rooms | 0 | 1.00 | 5.00e-02 | 6.40e-01 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 1.20e+01 | ▇▁▁▁▁ |
| reviews_per_month | 2677 | 0.76 | 1.65e+00 | 1.74e+00 | 1.00e-02 | 3.40e-01 | 1.00e+00 | 2.42e+00 | 1.29e+01 | ▇▂▁▁▁ |
Based on our initial data analysis we identified 4 major types of variables in the underlying data set:
- Character values: 47
- Date values: 5
- Logical values: 15
- Numeric values: 39
We also have seen that we have 11,314 observations (apartments) & a total of 106 data points per apartment.
We identified many variables that have a characteristic which make it either not interesting to analyze (only one/ very few distinct values, text strings) or that we think we will not use in the analysis later on.
So we excluded these columns/ data points in order to make the data easier & faster to handle.
athens_data_red <- athens_data %>%
#Select the relevant variables
select(
id,
neighbourhood,
zipcode,
latitude,
longitude,
property_type,
room_type,
accommodates,
bathrooms,
bedrooms,
beds,
price,
weekly_price,
monthly_price,
security_deposit,
cleaning_fee,
guests_included,
extra_people,
minimum_nights,
maximum_nights,
availability_365,
number_of_reviews_ltm,
review_scores_rating,
review_scores_checkin,
review_scores_cleanliness,
review_scores_accuracy,
review_scores_communication,
review_scores_location,
review_scores_value,
cancellation_policy,
reviews_per_month,
host = host_id,
host_response_time,
host_response_rate,
host_acceptance_rate,
host_is_superhost,
host_listings_count,
host_total_listings_count,
host_identity_verified,number_of_reviews,
host_instant_booking = instant_bookable
)We now only have 41 columns left, which make the data set easier to handle.
In a next step we will adjust the type of some variables so that we can actually can work with the data more easily.
- We transform the price, weekly price, monthly price, security deposit, cleaning fee, extra people, host response rate and host acceptance rate from character variables to numeric ones
- We create factor variables for Property type, room types, cancellation policy and host response time
# Transform character values to numeric values
athens_data_clean <- athens_data_red %>%
mutate(
price = as.numeric(str_remove_all(price, "[$ ,]")),
weekly_price = as.numeric(str_remove_all(weekly_price, "[$ , ]")),
monthly_price = as.numeric(str_remove_all(monthly_price, "[$ ,]")),
cleaning_fee = as.numeric(str_remove_all(cleaning_fee, "[$ ,]")),
security_deposit = as.numeric(str_remove_all(security_deposit, "[$ ,]")),
extra_people = as.numeric(str_remove_all(extra_people, "[$ ,]")),
host_response_rate = as.numeric(str_remove_all(cleaning_fee, "[% ,]")),
host_acceptance_rate = as.numeric(str_remove_all(cleaning_fee, "[% ,]"))
)# Create factor variables for room types
room_types <- unique(athens_data_clean$room_type)
athens_data_clean$room_type <- factor(athens_data_clean$room_type, labels = room_types)
# Create factor variables for cancellation policies
cancellation_policies <- unique(athens_data_clean$cancellation_policy)
athens_data_clean$cancellation_policy <- factor(athens_data_clean$cancellation_policy, labels = cancellation_policies)
# Create factor variables for host response time
athens_data_clean <- athens_data_clean %>%
mutate(host_response_time = fct_relevel(host_response_time,
"within an hour",
"within a few hours",
"within a day",
"a few days or more"
))The issue with the property types is that there are to much in order to generate reasonable factors. We need to analyze how much the share of each category. Best case would be that the majority of the property type share is done with a small number. If that is the case we can just summarize the rest in a new category calles “other”.
# Identify the amount of each property type
most_com_properties <- athens_data_clean %>%
count(property_type) %>%
mutate(percentage = n/sum(n)*100)%>%
arrange(desc(n))
most_com_properties## # A tibble: 26 x 3
## property_type n percentage
## <chr> <int> <dbl>
## 1 Apartment 9677 85.5
## 2 House 386 3.41
## 3 Condominium 261 2.31
## 4 Serviced apartment 187 1.65
## 5 Loft 180 1.59
## 6 Aparthotel 139 1.23
## 7 Hotel 135 1.19
## 8 Boutique hotel 120 1.06
## 9 Bed and breakfast 49 0.433
## 10 Hostel 38 0.336
## # … with 16 more rows
As the 5 most common property types account for ~95% of the total share we can just focus on them and summarize the rest in “Others”
# First we need to summarize the other values in the Category "Others"
athens_data_clean <- athens_data_clean %>%
mutate(
property_type = case_when(
property_type %in% c("Apartment","House", "Condominium","Serviced Apartment", "Loft")
~ property_type,
TRUE ~ "Other"))
# In a next step we can make a factor out of the 6 pre-defined categories
athens_data_clean <- athens_data_clean %>%
mutate(
property_type = fct_relevel(property_type,
"Apartment",
"House",
"Condominium",
"Serviced Apartment",
"Loft",
"Other"))We now have transformed the data types of the most variables in order to make the data set even cleaner. We have deleted unnecessary values, adjusted wrong variable types and now we will further inspect the quality of our data.
In a this step we will further manipulate the data set. In specific we will correct the NA values in cases in which we can estimate the value.
- If no weekly price -> no discount -> we will insert the daily price multiplied by 7
- If no monthly price -> no discount -> we will insert the daily price multiplied by 30
- If no security deposit/ cleaning fee -> no fee -> we will insert 0
# We will replace the NAs in the weekly prices and assume there is no discount if NA
athens_data_clean$weekly_price[is.na(athens_data_clean$weekly_price)] <-
athens_data_clean$price *7
# We will replace the NAs in the monthly prices and assume there is no discount if NA
athens_data_clean$monthly_price[is.na(athens_data_clean$monthly_price)] <-
athens_data_clean$price * 30
# We will replace the NAs in the security deposit & cleaning fee and assume 0 if NA
athens_data_clean$cleaning_fee[is.na(athens_data_clean$cleaning_fee)] <- 0
athens_data_clean$security_deposit[is.na(athens_data_clean$security_deposit)] <- 0We now have cleaned the data to a nearly perfect amount. The only thing we haven’t yet included are outliers which will be captured in the next paragraph.
We will screen the most important variable price, which we need in our analysis later on, for potential outliers. We will exclude the extreme values, which make no sense economically (way too high prices). Reasons which could explain these extremly high prices are unwillingness to list at the moment, fake listings or extremly luxurious apartments.
# Quick plot to see outliers
athens_data_clean %>%
ggplot(aes(x = price)) +
geom_histogram() +
labs(title= "Distribution of prices in our original data")# Looks very scewed, probably a log-normal distribution, use log -> normal
athens_data_clean %>%
ggplot(aes(x = log(price))) +
geom_histogram()# There seem to be a few outliers. We will remove them using the IQR method, becauses we belive that keeping those values would skew our analysis
IQR.outliers <- function(x) {
Q3 <- quantile(x,0.95)
Q1 <- quantile(x,0.05)
IQR <- (Q3-Q1)
left <- (Q1-(1.5*IQR))
right <- (Q3+(1.5*IQR))
print(c(left, right))
c(x[x <left],x[x>right])
}
# Print outliers
IQR.outliers(athens_data_clean$price)## 5% 95%
## -180 352
## [1] 354 600 459 400 400 385 1000 515 410 640 412 502 650 400 400
## [16] 600 450 1000 1000 1000 495 1000 400 500 500 500 500 500 500 525
## [31] 404 800 700 500 402 450 450 540 360 810 487 7000 7000 7000 7000
## [46] 390 460 400 600 400 353 357 400 426 500 1500 900 1200 450 450
## [61] 800 400 990 600 1000 500 1000 5000 400 800 360 1000 390 500 500
## [76] 400 400 400 600 1290 999 1000 700 720 700 700 1000
athens_data_clean %>%
filter(!(price %in% IQR.outliers(athens_data_clean$price))) %>%
ggplot(aes(x = log(price))) +
geom_histogram()## 5% 95%
## -180 352
#Defining our final data set, which has no more outliers
athens_data_final <- athens_data_clean %>%
filter(!(price %in% IQR.outliers(athens_data_clean$price)))## 5% 95%
## -180 352
We finally derived our final data set with which we can start with the analysis part of the project. We reduced the relevant columns to 41, the relevant data points (without outliers) to 11,227 & cleaned readjusted many data types, values.
As we now have finally derived with a data set, which has only the relevant values, right variable types, adjusted NA values and is corrected for outliers, we can finally start with the analysis of the data.
How important is the location for the price? Are central locations more expensive?
# First we start with a simple plot, showing our Airbnbs
qmplot(longitude, latitude, data = athens_data_final, color = price)# Syntagma coordinates
syntagma <- c(37.975344, 23.73472)
names(syntagma) <- c("longitude", "latitude")
# Athene map
athens_map = get_map(location=c(23.68,
37.945,
23.8,
38.035), maptype="terrain-background")
athens_map <- ggmap(athens_map)
# We dont want to see the axis when we are ploting maps
map_theme <- theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
# Plot the map and Syntagma, is there a connection between prices and the centre?
athens_map +
geom_point(data=athens_data_final, aes(x = longitude, y = latitude, color = price)) +
geom_point(aes(x = syntagma['latitude'], syntagma['longitude']),
color = 'red', size = 5) +
map_theme +
labs(title="Airbnbs around the centre seem to be more expensive",
subtitle = "Centre - Syntagma Square")According to the graph there seems to be a connection between the price and the distance to the center. We will now try to calculate the distance to the center and try to see if the colors of the two graphs fit.
# Now that we assume that there is a connection, we calculate the distance for each airbnb
head(athens_data_final)## # A tibble: 6 x 41
## id neighbourhood zipcode latitude longitude property_type room_type
## <dbl> <chr> <chr> <dbl> <dbl> <fct> <fct>
## 1 10595 Ambelokipi 11526 38.0 23.8 Apartment Entire h…
## 2 10990 Ambelokipi 11526 38.0 23.8 Apartment Entire h…
## 3 10993 Ambelokipi 115 26 38.0 23.8 Apartment Entire h…
## 4 10995 Ambelokipi 11526 38.0 23.8 Apartment Entire h…
## 5 27262 Ambelokipi 11526 38.0 23.8 Apartment Entire h…
## 6 28186 Plaka 105 63 38.0 23.7 Loft Entire h…
## # … with 34 more variables: accommodates <dbl>, bathrooms <dbl>,
## # bedrooms <dbl>, beds <dbl>, price <dbl>, weekly_price <dbl>,
## # monthly_price <dbl>, security_deposit <dbl>, cleaning_fee <dbl>,
## # guests_included <dbl>, extra_people <dbl>, minimum_nights <dbl>,
## # maximum_nights <dbl>, availability_365 <dbl>, number_of_reviews_ltm <dbl>,
## # review_scores_rating <dbl>, review_scores_checkin <dbl>,
## # review_scores_cleanliness <dbl>, review_scores_accuracy <dbl>,
## # review_scores_communication <dbl>, review_scores_location <dbl>,
## # review_scores_value <dbl>, cancellation_policy <fct>,
## # reviews_per_month <dbl>, host <dbl>, host_response_time <fct>,
## # host_response_rate <dbl>, host_acceptance_rate <dbl>,
## # host_is_superhost <lgl>, host_listings_count <dbl>,
## # host_total_listings_count <dbl>, host_identity_verified <lgl>,
## # number_of_reviews <dbl>, host_instant_booking <lgl>
# Calculate the distance
athens_data_final<- athens_data_final %>%
rowwise() %>%
mutate(
cent_dist = distm(c(latitude, longitude), c(37.975344, 23.73472),
fun = distHaversine)[1,1]
)
# Test if our numbers are correct visually
athens_map +
geom_point(data=athens_data_final, aes(x = longitude, y = latitude, color = cent_dist)) +
geom_point(aes(x = syntagma['latitude'], syntagma['longitude']), color = 'red', size = 5) +
map_theme +
labs(title = "Locations seem to impact the prices of the airbnbs - same pattern as above" , subtitle = "Distance from center in meters")If we compare this graph to the graph above, we can indeed see that the same regions tend to have the same colors. Therefore we conclude that the location will indeed have an impact on the price of the airbnbs and that Airbnbs located in nearer in the center tend to have on average a higher price.
Next we are going to explore if based on location there are differences in room types. We expect to have center locations to have on average smaller offerings (e.g. shared rooms)
## [1] 4
avg_dist <- athens_data_final %>%
group_by(neighbourhood) %>%
summarise(
avg_dist = mean(cent_dist)
) %>%
arrange(-avg_dist)
athens_data_final %>%
filter(!is.na(neighbourhood)) %>%
select(neighbourhood,
room_type) %>%
group_by(neighbourhood,
room_type) %>%
summarise(n = n()) %>%
mutate(perc = n/sum(n)) %>%
ggplot(aes(fill=room_type, x=perc, y=factor(neighbourhood,levels = avg_dist$neighbourhood))) +
geom_bar(position="fill", stat="identity") +
labs(title="Average distance from the centre does not seem to impact room types",
subtitle = "Average distance in decreasing order") +
ylab("") +
xlab("") +
guides(fill=guide_legend(title="Room types"))We identified that there is no significance patterns visible. Our hypothesis that more central locations have a higher amount of shared rooms, private rooms than locations further away must therefore be invalid.
#First we want to see how the ratings are distributed in general
athens_data_final %>%
ggplot(aes(x=review_scores_rating)) +
geom_histogram() +
# Due to the high skew in distribution, a log y scale makes it easier to read
scale_y_log10() +
xlab("Review scores rating") +
ylab("Quantity") +
labs(title = "Most hosts seem to convince the tentants of their apartment", subtitle = "High negative skew in distribution")# We want to see if the response time has an influence on the general rating of the apartment
# create a bar chart to see the review scores based on response time
athens_data_final %>%
filter(host_response_time != "N/A" & !is.na(host_response_time)) %>%
group_by(host_response_time) %>%
ggplot(aes(y=host_response_time, x=review_scores_rating)) +
geom_boxplot() +
xlim(85,100) +
ylab("Host response time") +
xlab("Review Scores rating") +
labs(title = "Fast response time not valued enough to have impact on rating", subtitle = "The longer the response time the higher the median rating")We identified that the response time has not a huge impact on the general rating of the Airbnb. We will now try to identify more significant factors. Let’s try to test if the price per bed influences the rating.
# We want to see if the price is a significant factor for the rating
# In order to reduce the bias in the data we will use the price per bed
athens_data_final %>%
summarize(
price_per_bed = price/beds,
review_scores_rating
) %>% ggplot(aes(x=price_per_bed, y=review_scores_rating)) +
geom_point() +
scale_x_log10() +
ylim(60,100) +
xlab("Price per bed") +
ylab("Review Score Rating") +
labs(title = "No correlation between price per bed and review score",
subtitle = "Distribution of review scores and price per bed")Once again we cannot identify a clear trend in the data. They seems to be no correlation between the price per bed and the average rating. We will give the analysis one last try and explore if the rating is influenced by the fact if the host is a superhost (which has many responsibilites compared to a normal host) or not.
# Analysis if superhost status has a positive impact on the rating
athens_data_final %>%
filter(!is.na(host_is_superhost)) %>%
ggplot(aes(x=host_is_superhost, y=review_scores_rating)) +
geom_boxplot() +
ylim(60,100) +
xlab("Host is superhost?") +
ylab("Review Score Rating") +
labs(title = "Superhosts seem to make people happier during their stay",
subtitle = "Rating distribution based on Superhost criterion") Finally we found a relationship. In our eyes this makes completely sense - in order to receive a superhost rating you need to fulfill a lot of requirements (e.g. you are not allowed to cancel as soon as you have accepted hosts & you need to have specific response times etc.). Therefore, the superhost variable includes a lot of positive attributes, which kind of explains that people feel that stays in their apartments worked out particulariy well. Many of them also do this professionally and therefore value reputation a lot.
We were quite surprised that neither the price per bed nor the response time of the host (which we have seen as an indicator of the commitment from host side) played a major role in the overall rating. We came up with possible explainations. We think that the price has no impact as people book apartments based on their individual price preferences and then rate the stay according to their experiences. Therefore the price criterion is outweighted by other factors. Regarding the host response time, we concluded that this variable probably doesn’t reflect the commitment of the host in an ideal way. There are many more factors, which are not included - therefore the general impact of the response time is too low to see any impact.
We haven’t yet analyzed the room type. However, we have the hypothesis that the room type will impact the price which can be achieved with an apartment.
Is there a difference in price among room types?
# create a plot to show the density and distribution for the price grouped by each room type
athens_data_final %>%
ggplot(aes(x=price, y=room_type, fill=room_type)) +
geom_violin( ) +
# make differences more visible in relevant interval
xlim(0,250) +
# In order to make differences more visible
scale_x_log10() +
xlab("Price") +
ylab("Density") +
stat_summary(fun.y=median, geom="point", size=3, color="black") +
labs(title = "Private rooms with highest median prices, closely followed by whole apartments",
subtitle = "Distribution of price per room type") +
theme(strip.text.x = element_text(size = 10), legend.position = "none")First we were quite confused that private rooms are on average more expensive than the apartments. However after having a look of the quanitity of the room types we identified that apartments are way more common than shared rooms. As the overall data quantity is so little compared to apartments, it’s likely that outliers adjust the price upwards. It makes sense that shared rooms are really cheap, in the rante between 10 and 30 Euro per night.
We will now conduct the same analysis but adjust (like above) the price by the amount of persons the apartment can carry. We expect the results to be more equally distributed.
# create a plot to show the density and distribution for the price per person grouped by each room type
athens_data_final %>%
ggplot(aes(x=price/accommodates, y=room_type, fill=room_type)) +
geom_violin() +
# make differences more visible in relevant interval
xlim(0,250) +
# In order to make differences more visible
scale_x_log10() +
xlab("Price") +
ylab("Density") +
stat_summary(fun.y=median, geom="point", size=3, color="black") +
labs(title = "Differences in prices per person smaller between apartment types",
subtitle = "Distribution of price per person per room type") +
theme(strip.text.x = element_text(size = 10), legend.position = "none") > We saw that although the total room price for apartments is higher than the one for shared rooms & hotel rooms in total, the price difference is smaller if you account for the number of accomodates which can be fit in one apartment. Now the median price per person is nearly identical among these 3 categories. Private rooms are still an outlier, but we think it is due to the same reasoning as above.
We will try to find the best fitting model to predict per night prices
athens_data_final %>%
na.omit() %>%
select_if(is.numeric) %>%
cor() %>%
as.data.frame() %>%
select(price) %>%
add_rownames(var = "variable") %>%
arrange(price) %>%
ggplot(aes(x = price, y = reorder(variable, price))) +
geom_col() +
ylab("") +
xlab("Correlation") +
labs(title = "Distance from central is the most negative correlation",
subtitle = "Correlations with price")athens_data_final %>%
select(cent_dist, price, accommodates, bedrooms, bathrooms, host_is_superhost, beds, cleaning_fee) %>%
na.omit() %>%
cor() %>%
round(2) %>%
melt() %>%
mutate(
Var1 = case_when(
Var1 == "cent_dist" ~ "Distance from centre",
Var1 == "price" ~ "Price",
Var1 == "accommodates" ~ "Accommodation",
Var1 == "bedrooms" ~ "Number of bedrooms",
Var1 == "bathrooms" ~ "Number of bathrooms",
Var1 == "host_is_superhost" ~ "Superhost",
Var1 == "beds" ~ "Number of beds",
Var1 == "cleaning_fee" ~ "Cleaning fee"),
Var2 = case_when(
Var2 == "cent_dist" ~ "Distance from centre",
Var2 == "price" ~ "Price",
Var2 == "accommodates" ~ "Accommodation",
Var2 == "bedrooms" ~ "Number of bedrooms",
Var2 == "bathrooms" ~ "Number of bathrooms",
Var2 == "host_is_superhost" ~ "Superhost",
Var2 == "beds" ~ "Number of beds",
Var2 == "cleaning_fee" ~ "Cleaning fee")
)%>%
ggplot(aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = lbs_blue, high = lbs_pink, mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
coord_fixed() +
labs(title="Bathrooms, bedrooms, accomedation \nand price all positive correlation",
subtitle="Correlation accross variables") +
xlab("") +
ylab("")Using correlations doesnt seem to work too well, we will need to find another way
# First we will split our data into a training and testing set
# Set seed so we will get the same results
set.seed(202019)
size <- floor(0.75 * nrow(athens_data_final))
train_ind <- sample(seq_len(nrow(athens_data_final)), size = size)
train <- athens_data_final[train_ind, ] %>% na.omit()
test <- athens_data_final[-train_ind, ] %>% na.omit()OLS
library(stats)
# To choose a model we will use Akaike's information criterion
# Univariate regression
# Model 1
model1 <- lm(log(price) ~ as.factor(accommodates),
data=na.omit(train))
# Are airbnbs that accomodate 8 people necessarily 2 times as expensive? We do not think so, therefore we use factors instead.
summary(model1) ##
## Call:
## lm(formula = log(price) ~ as.factor(accommodates), data = na.omit(train))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2346 -0.3189 -0.0343 0.2689 2.1660
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9060 0.0594 48.89 <2e-16 ***
## as.factor(accommodates)2 0.6318 0.0608 10.40 <2e-16 ***
## as.factor(accommodates)3 0.7659 0.0618 12.40 <2e-16 ***
## as.factor(accommodates)4 0.9791 0.0605 16.20 <2e-16 ***
## as.factor(accommodates)5 1.0913 0.0629 17.34 <2e-16 ***
## as.factor(accommodates)6 1.2606 0.0623 20.22 <2e-16 ***
## as.factor(accommodates)7 1.3722 0.0737 18.61 <2e-16 ***
## as.factor(accommodates)8 1.5456 0.0708 21.82 <2e-16 ***
## as.factor(accommodates)9 1.7469 0.0995 17.56 <2e-16 ***
## as.factor(accommodates)10 1.6034 0.0969 16.54 <2e-16 ***
## as.factor(accommodates)11 1.4644 0.1681 8.71 <2e-16 ***
## as.factor(accommodates)12 1.8935 0.1168 16.21 <2e-16 ***
## as.factor(accommodates)13 1.1504 0.1880 6.12 1e-09 ***
## as.factor(accommodates)14 2.0423 0.1681 12.15 <2e-16 ***
## as.factor(accommodates)15 2.1075 0.2192 9.61 <2e-16 ***
## as.factor(accommodates)16 2.3186 0.1189 19.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.472 on 5645 degrees of freedom
## Multiple R-squared: 0.28, Adjusted R-squared: 0.278
## F-statistic: 147 on 15 and 5645 DF, p-value: <2e-16
## [1] 0.28
## [1] 7579
# Multivariate Regression
# Judging by the correlations we can predict which variables might have a bigger impact, now we will use how many people the airbnb accomodates and how many bedrooms there are
# Model 2
model2 <- lm(log(price) ~ as.factor(accommodates) + bedrooms,
data=na.omit(train))
summary(model2) ##
## Call:
## lm(formula = log(price) ~ as.factor(accommodates) + bedrooms,
## data = na.omit(train))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9781 -0.3034 -0.0383 0.2529 2.1500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8133 0.0602 46.77 < 2e-16 ***
## as.factor(accommodates)2 0.6415 0.0604 10.62 < 2e-16 ***
## as.factor(accommodates)3 0.7636 0.0614 12.44 < 2e-16 ***
## as.factor(accommodates)4 0.9464 0.0602 15.71 < 2e-16 ***
## as.factor(accommodates)5 0.9970 0.0636 15.68 < 2e-16 ***
## as.factor(accommodates)6 1.1434 0.0636 17.99 < 2e-16 ***
## as.factor(accommodates)7 1.2280 0.0753 16.30 < 2e-16 ***
## as.factor(accommodates)8 1.3598 0.0739 18.39 < 2e-16 ***
## as.factor(accommodates)9 1.5171 0.1027 14.77 < 2e-16 ***
## as.factor(accommodates)10 1.3470 0.1012 13.31 < 2e-16 ***
## as.factor(accommodates)11 1.2490 0.1692 7.38 1.8e-13 ***
## as.factor(accommodates)12 1.6307 0.1204 13.54 < 2e-16 ***
## as.factor(accommodates)13 0.9178 0.1890 4.86 1.2e-06 ***
## as.factor(accommodates)14 1.7060 0.1720 9.92 < 2e-16 ***
## as.factor(accommodates)15 1.6854 0.2238 7.53 5.9e-14 ***
## as.factor(accommodates)16 1.9493 0.1264 15.43 < 2e-16 ***
## bedrooms 0.0990 0.0120 8.26 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.469 on 5644 degrees of freedom
## Multiple R-squared: 0.289, Adjusted R-squared: 0.287
## F-statistic: 143 on 16 and 5644 DF, p-value: <2e-16
## [1] 0.289
## [1] 7513
# Both the r2 and the AIC is smaller with this model, which means that the previous one would be prefered
model2 <- lm(log(price) ~ as.factor(accommodates) + cent_dist, data=na.omit(train))
summary(model2) ##
## Call:
## lm(formula = log(price) ~ as.factor(accommodates) + cent_dist,
## data = na.omit(train))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.173 -0.293 -0.024 0.251 2.076
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.27e+00 5.75e-02 56.93 < 2e-16 ***
## as.factor(accommodates)2 5.79e-01 5.72e-02 10.13 < 2e-16 ***
## as.factor(accommodates)3 7.25e-01 5.81e-02 12.49 < 2e-16 ***
## as.factor(accommodates)4 9.27e-01 5.69e-02 16.30 < 2e-16 ***
## as.factor(accommodates)5 1.03e+00 5.92e-02 17.47 < 2e-16 ***
## as.factor(accommodates)6 1.19e+00 5.87e-02 20.22 < 2e-16 ***
## as.factor(accommodates)7 1.27e+00 6.94e-02 18.30 < 2e-16 ***
## as.factor(accommodates)8 1.47e+00 6.66e-02 22.10 < 2e-16 ***
## as.factor(accommodates)9 1.59e+00 9.37e-02 16.97 < 2e-16 ***
## as.factor(accommodates)10 1.53e+00 9.11e-02 16.78 < 2e-16 ***
## as.factor(accommodates)11 1.27e+00 1.58e-01 8.01 1.4e-15 ***
## as.factor(accommodates)12 1.85e+00 1.10e-01 16.87 < 2e-16 ***
## as.factor(accommodates)13 1.07e+00 1.77e-01 6.04 1.6e-09 ***
## as.factor(accommodates)14 1.96e+00 1.58e-01 12.40 < 2e-16 ***
## as.factor(accommodates)15 2.02e+00 2.06e-01 9.82 < 2e-16 ***
## as.factor(accommodates)16 2.23e+00 1.12e-01 19.96 < 2e-16 ***
## cent_dist -1.75e-04 6.43e-06 -27.30 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.444 on 5644 degrees of freedom
## Multiple R-squared: 0.364, Adjusted R-squared: 0.362
## F-statistic: 202 on 16 and 5644 DF, p-value: <2e-16
## [1] 0.364
## [1] 6879
# Our r2 is much better now, and our Akaike criterion also droped by quite a big margin. This is likely due to the fact, that the distance from the center is a big factor when people price airbnbs
# Model 3
model3 <- lm(log(price) ~ as.factor(accommodates) + cent_dist + room_type,
data=na.omit(train))
summary(model3) ##
## Call:
## lm(formula = log(price) ~ as.factor(accommodates) + cent_dist +
## room_type, data = na.omit(train))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9693 -0.2869 -0.0342 0.2397 2.0081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.65e+00 6.02e-02 60.51 < 2e-16 ***
## as.factor(accommodates)2 2.68e-01 5.88e-02 4.57 5.0e-06 ***
## as.factor(accommodates)3 3.55e-01 6.07e-02 5.85 5.3e-09 ***
## as.factor(accommodates)4 5.52e-01 5.96e-02 9.26 < 2e-16 ***
## as.factor(accommodates)5 6.58e-01 6.18e-02 10.64 < 2e-16 ***
## as.factor(accommodates)6 8.11e-01 6.13e-02 13.24 < 2e-16 ***
## as.factor(accommodates)7 8.95e-01 7.12e-02 12.57 < 2e-16 ***
## as.factor(accommodates)8 1.10e+00 6.85e-02 16.07 < 2e-16 ***
## as.factor(accommodates)9 1.21e+00 9.40e-02 12.90 < 2e-16 ***
## as.factor(accommodates)10 1.15e+00 9.16e-02 12.51 < 2e-16 ***
## as.factor(accommodates)11 8.90e-01 1.56e-01 5.71 1.2e-08 ***
## as.factor(accommodates)12 1.51e+00 1.09e-01 13.90 < 2e-16 ***
## as.factor(accommodates)13 6.89e-01 1.74e-01 3.97 7.3e-05 ***
## as.factor(accommodates)14 1.58e+00 1.56e-01 10.16 < 2e-16 ***
## as.factor(accommodates)15 1.64e+00 2.02e-01 8.14 4.7e-16 ***
## as.factor(accommodates)16 1.91e+00 1.11e-01 17.24 < 2e-16 ***
## cent_dist -1.73e-04 6.27e-06 -27.53 < 2e-16 ***
## room_typePrivate room 1.67e-01 5.50e-02 3.03 0.0025 **
## room_typeHotel room -3.94e-01 2.64e-02 -14.93 < 2e-16 ***
## room_typeShared room -8.53e-01 8.53e-02 -10.01 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.432 on 5641 degrees of freedom
## Multiple R-squared: 0.397, Adjusted R-squared: 0.395
## F-statistic: 196 on 19 and 5641 DF, p-value: <2e-16
## [1] 0.397
## [1] 6582
# Room types will impact prices, as people would pay a premium for better acommendation, threfore with the room types we could improve our model also.
# Model 4
model4 <- lm(log(price) ~ as.factor(accommodates) + room_type + bedrooms + bathrooms + cent_dist,
data=na.omit(train))
summary(model4) ##
## Call:
## lm(formula = log(price) ~ as.factor(accommodates) + room_type +
## bedrooms + bathrooms + cent_dist, data = na.omit(train))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6652 -0.2814 -0.0249 0.2390 1.9998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.33e+00 6.14e-02 54.31 < 2e-16 ***
## as.factor(accommodates)2 2.95e-01 5.72e-02 5.16 2.5e-07 ***
## as.factor(accommodates)3 3.66e-01 5.91e-02 6.19 6.5e-10 ***
## as.factor(accommodates)4 5.26e-01 5.82e-02 9.03 < 2e-16 ***
## as.factor(accommodates)5 5.54e-01 6.12e-02 9.06 < 2e-16 ***
## as.factor(accommodates)6 6.50e-01 6.12e-02 10.62 < 2e-16 ***
## as.factor(accommodates)7 6.65e-01 7.13e-02 9.33 < 2e-16 ***
## as.factor(accommodates)8 7.87e-01 7.01e-02 11.23 < 2e-16 ***
## as.factor(accommodates)9 8.11e-01 9.53e-02 8.51 < 2e-16 ***
## as.factor(accommodates)10 6.86e-01 9.40e-02 7.30 3.4e-13 ***
## as.factor(accommodates)11 5.73e-01 1.54e-01 3.73 0.00019 ***
## as.factor(accommodates)12 1.02e+00 1.10e-01 9.24 < 2e-16 ***
## as.factor(accommodates)13 3.00e-01 1.71e-01 1.75 0.07962 .
## as.factor(accommodates)14 9.07e-01 1.57e-01 5.78 7.8e-09 ***
## as.factor(accommodates)15 9.22e-01 2.02e-01 4.55 5.4e-06 ***
## as.factor(accommodates)16 1.18e+00 1.17e-01 10.14 < 2e-16 ***
## room_typePrivate room 1.02e-01 5.38e-02 1.90 0.05695 .
## room_typeHotel room -4.50e-01 2.59e-02 -17.38 < 2e-16 ***
## room_typeShared room -9.15e-01 8.32e-02 -11.00 < 2e-16 ***
## bedrooms 8.98e-02 1.12e-02 8.00 1.5e-15 ***
## bathrooms 2.02e-01 1.55e-02 13.01 < 2e-16 ***
## cent_dist -1.72e-04 6.14e-06 -27.95 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.42 on 5639 degrees of freedom
## Multiple R-squared: 0.429, Adjusted R-squared: 0.427
## F-statistic: 202 on 21 and 5639 DF, p-value: <2e-16
## [1] 0.429
## [1] 6278
# In this model we try to implement our distance variable, and more information about the flats. Although our model has higher R2 and AIC, it did not have a big effect
# Model 5
model5 <- lm(log(price) ~ as.factor(accommodates) + room_type + bedrooms + bathrooms + cent_dist + as.factor(neighbourhood) * cent_dist,
data=na.omit(train))
summary(model5) ##
## Call:
## lm(formula = log(price) ~ as.factor(accommodates) + room_type +
## bedrooms + bathrooms + cent_dist + as.factor(neighbourhood) *
## cent_dist, data = na.omit(train))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5906 -0.2475 -0.0191 0.2165 2.0084
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error
## (Intercept) 1.95e+00 7.08e-01
## as.factor(accommodates)2 2.27e-01 5.41e-02
## as.factor(accommodates)3 3.04e-01 5.58e-02
## as.factor(accommodates)4 4.45e-01 5.51e-02
## as.factor(accommodates)5 4.79e-01 5.79e-02
## as.factor(accommodates)6 5.55e-01 5.79e-02
## as.factor(accommodates)7 5.60e-01 6.73e-02
## as.factor(accommodates)8 7.11e-01 6.61e-02
## as.factor(accommodates)9 7.15e-01 8.97e-02
## as.factor(accommodates)10 6.02e-01 8.86e-02
## as.factor(accommodates)11 4.84e-01 1.44e-01
## as.factor(accommodates)12 8.84e-01 1.04e-01
## as.factor(accommodates)13 2.10e-01 1.61e-01
## as.factor(accommodates)14 8.26e-01 1.48e-01
## as.factor(accommodates)15 9.08e-01 1.90e-01
## as.factor(accommodates)16 1.05e+00 1.10e-01
## room_typePrivate room 1.03e-01 5.16e-02
## room_typeHotel room -3.97e-01 2.46e-02
## room_typeShared room -1.00e+00 7.99e-02
## bedrooms 1.10e-01 1.07e-02
## bathrooms 1.83e-01 1.47e-02
## cent_dist 1.91e-04 1.73e-04
## as.factor(neighbourhood)Agios Nikolaos 5.78e-01 8.57e-01
## as.factor(neighbourhood)Akadimia Platonos 2.76e+00 9.21e-01
## as.factor(neighbourhood)Ambelokipi 9.96e-01 7.15e-01
## as.factor(neighbourhood)Attiki 1.10e+00 7.75e-01
## as.factor(neighbourhood)Exarcheia 1.00e+00 7.18e-01
## as.factor(neighbourhood)Gazi 1.32e-01 1.07e+00
## as.factor(neighbourhood)Goudi -1.46e+00 2.90e+00
## as.factor(neighbourhood)Ilisia 1.48e+00 7.96e-01
## as.factor(neighbourhood)Kerameikos 1.53e+00 7.26e-01
## as.factor(neighbourhood)Kolonaki 1.38e+00 7.08e-01
## as.factor(neighbourhood)Kolonos 1.42e+00 8.14e-01
## as.factor(neighbourhood)Koukaki 1.61e+00 7.11e-01
## as.factor(neighbourhood)Kypseli 6.11e-01 7.61e-01
## as.factor(neighbourhood)Larissis 9.45e-01 7.25e-01
## as.factor(neighbourhood)Metaxourgeio 7.19e-01 7.21e-01
## as.factor(neighbourhood)Mets 1.59e+00 7.19e-01
## as.factor(neighbourhood)Monastiraki 1.31e+00 8.99e-01
## as.factor(neighbourhood)Neapoli 9.11e-01 7.30e-01
## as.factor(neighbourhood)Neos Kosmos 1.58e+00 7.09e-01
## as.factor(neighbourhood)Pangrati 1.21e+00 7.12e-01
## as.factor(neighbourhood)Patisia -1.15e-01 8.13e-01
## as.factor(neighbourhood)Pedion Areos 1.70e+00 7.63e-01
## as.factor(neighbourhood)Petralona 1.47e+00 7.56e-01
## as.factor(neighbourhood)Plaka 1.59e+00 7.07e-01
## as.factor(neighbourhood)Profitis Daniil 2.21e+00 1.01e+00
## as.factor(neighbourhood)Psyri 8.93e-01 7.27e-01
## as.factor(neighbourhood)Rizoupoli -5.53e-01 2.65e+00
## as.factor(neighbourhood)Rouf -1.44e-02 4.14e-01
## as.factor(neighbourhood)Sepolia 2.85e+00 2.66e+00
## as.factor(neighbourhood)Thiseio 2.27e+00 7.74e-01
## as.factor(neighbourhood)Votanikos 8.60e-01 1.47e+00
## cent_dist:as.factor(neighbourhood)Agios Nikolaos -1.30e-04 2.33e-04
## cent_dist:as.factor(neighbourhood)Akadimia Platonos -7.26e-04 2.59e-04
## cent_dist:as.factor(neighbourhood)Ambelokipi -1.87e-04 1.76e-04
## cent_dist:as.factor(neighbourhood)Attiki -3.05e-04 2.14e-04
## cent_dist:as.factor(neighbourhood)Exarcheia -2.05e-04 2.00e-04
## cent_dist:as.factor(neighbourhood)Gazi 1.98e-04 3.53e-04
## cent_dist:as.factor(neighbourhood)Goudi 4.76e-04 7.45e-04
## cent_dist:as.factor(neighbourhood)Ilisia -3.50e-04 2.36e-04
## cent_dist:as.factor(neighbourhood)Kerameikos -3.26e-04 1.87e-04
## cent_dist:as.factor(neighbourhood)Kolonaki -2.27e-04 1.80e-04
## cent_dist:as.factor(neighbourhood)Kolonos -3.69e-04 2.27e-04
## cent_dist:as.factor(neighbourhood)Koukaki -4.52e-04 1.79e-04
## cent_dist:as.factor(neighbourhood)Kypseli -1.20e-04 1.99e-04
## cent_dist:as.factor(neighbourhood)Larissis -2.06e-04 1.93e-04
## cent_dist:as.factor(neighbourhood)Metaxourgeio 3.51e-05 1.89e-04
## cent_dist:as.factor(neighbourhood)Mets -5.44e-04 2.03e-04
## cent_dist:as.factor(neighbourhood)Monastiraki -2.49e-06 5.30e-04
## cent_dist:as.factor(neighbourhood)Neapoli -6.33e-05 2.12e-04
## cent_dist:as.factor(neighbourhood)Neos Kosmos -4.75e-04 1.76e-04
## cent_dist:as.factor(neighbourhood)Pangrati -3.02e-04 1.82e-04
## cent_dist:as.factor(neighbourhood)Patisia 4.67e-05 1.99e-04
## cent_dist:as.factor(neighbourhood)Pedion Areos -5.38e-04 2.15e-04
## cent_dist:as.factor(neighbourhood)Petralona -3.41e-04 1.96e-04
## cent_dist:as.factor(neighbourhood)Plaka -2.80e-04 1.82e-04
## cent_dist:as.factor(neighbourhood)Profitis Daniil -4.10e-04 2.99e-04
## cent_dist:as.factor(neighbourhood)Psyri 1.47e-04 2.18e-04
## cent_dist:as.factor(neighbourhood)Rizoupoli 7.87e-05 5.21e-04
## cent_dist:as.factor(neighbourhood)Rouf NA NA
## cent_dist:as.factor(neighbourhood)Sepolia -7.68e-04 7.35e-04
## cent_dist:as.factor(neighbourhood)Thiseio -6.74e-04 2.30e-04
## cent_dist:as.factor(neighbourhood)Votanikos -7.58e-05 4.72e-04
## t value Pr(>|t|)
## (Intercept) 2.76 0.00589 **
## as.factor(accommodates)2 4.20 2.8e-05 ***
## as.factor(accommodates)3 5.44 5.5e-08 ***
## as.factor(accommodates)4 8.09 7.5e-16 ***
## as.factor(accommodates)5 8.27 < 2e-16 ***
## as.factor(accommodates)6 9.58 < 2e-16 ***
## as.factor(accommodates)7 8.33 < 2e-16 ***
## as.factor(accommodates)8 10.75 < 2e-16 ***
## as.factor(accommodates)9 7.96 2.0e-15 ***
## as.factor(accommodates)10 6.79 1.2e-11 ***
## as.factor(accommodates)11 3.35 0.00081 ***
## as.factor(accommodates)12 8.49 < 2e-16 ***
## as.factor(accommodates)13 1.30 0.19200
## as.factor(accommodates)14 5.60 2.3e-08 ***
## as.factor(accommodates)15 4.78 1.8e-06 ***
## as.factor(accommodates)16 9.49 < 2e-16 ***
## room_typePrivate room 2.00 0.04546 *
## room_typeHotel room -16.16 < 2e-16 ***
## room_typeShared room -12.56 < 2e-16 ***
## bedrooms 10.29 < 2e-16 ***
## bathrooms 12.46 < 2e-16 ***
## cent_dist 1.10 0.26997
## as.factor(neighbourhood)Agios Nikolaos 0.67 0.49995
## as.factor(neighbourhood)Akadimia Platonos 3.00 0.00275 **
## as.factor(neighbourhood)Ambelokipi 1.39 0.16346
## as.factor(neighbourhood)Attiki 1.42 0.15687
## as.factor(neighbourhood)Exarcheia 1.40 0.16200
## as.factor(neighbourhood)Gazi 0.12 0.90213
## as.factor(neighbourhood)Goudi -0.50 0.61443
## as.factor(neighbourhood)Ilisia 1.86 0.06244 .
## as.factor(neighbourhood)Kerameikos 2.11 0.03468 *
## as.factor(neighbourhood)Kolonaki 1.95 0.05139 .
## as.factor(neighbourhood)Kolonos 1.74 0.08200 .
## as.factor(neighbourhood)Koukaki 2.26 0.02391 *
## as.factor(neighbourhood)Kypseli 0.80 0.42216
## as.factor(neighbourhood)Larissis 1.30 0.19240
## as.factor(neighbourhood)Metaxourgeio 1.00 0.31827
## as.factor(neighbourhood)Mets 2.21 0.02716 *
## as.factor(neighbourhood)Monastiraki 1.46 0.14380
## as.factor(neighbourhood)Neapoli 1.25 0.21156
## as.factor(neighbourhood)Neos Kosmos 2.23 0.02611 *
## as.factor(neighbourhood)Pangrati 1.70 0.08825 .
## as.factor(neighbourhood)Patisia -0.14 0.88801
## as.factor(neighbourhood)Pedion Areos 2.23 0.02578 *
## as.factor(neighbourhood)Petralona 1.95 0.05165 .
## as.factor(neighbourhood)Plaka 2.25 0.02441 *
## as.factor(neighbourhood)Profitis Daniil 2.19 0.02863 *
## as.factor(neighbourhood)Psyri 1.23 0.21909
## as.factor(neighbourhood)Rizoupoli -0.21 0.83448
## as.factor(neighbourhood)Rouf -0.03 0.97219
## as.factor(neighbourhood)Sepolia 1.07 0.28411
## as.factor(neighbourhood)Thiseio 2.93 0.00337 **
## as.factor(neighbourhood)Votanikos 0.58 0.55986
## cent_dist:as.factor(neighbourhood)Agios Nikolaos -0.56 0.57543
## cent_dist:as.factor(neighbourhood)Akadimia Platonos -2.80 0.00515 **
## cent_dist:as.factor(neighbourhood)Ambelokipi -1.07 0.28679
## cent_dist:as.factor(neighbourhood)Attiki -1.43 0.15379
## cent_dist:as.factor(neighbourhood)Exarcheia -1.03 0.30514
## cent_dist:as.factor(neighbourhood)Gazi 0.56 0.57380
## cent_dist:as.factor(neighbourhood)Goudi 0.64 0.52324
## cent_dist:as.factor(neighbourhood)Ilisia -1.49 0.13749
## cent_dist:as.factor(neighbourhood)Kerameikos -1.74 0.08201 .
## cent_dist:as.factor(neighbourhood)Kolonaki -1.26 0.20687
## cent_dist:as.factor(neighbourhood)Kolonos -1.63 0.10391
## cent_dist:as.factor(neighbourhood)Koukaki -2.53 0.01149 *
## cent_dist:as.factor(neighbourhood)Kypseli -0.60 0.54823
## cent_dist:as.factor(neighbourhood)Larissis -1.07 0.28465
## cent_dist:as.factor(neighbourhood)Metaxourgeio 0.19 0.85282
## cent_dist:as.factor(neighbourhood)Mets -2.68 0.00747 **
## cent_dist:as.factor(neighbourhood)Monastiraki 0.00 0.99626
## cent_dist:as.factor(neighbourhood)Neapoli -0.30 0.76545
## cent_dist:as.factor(neighbourhood)Neos Kosmos -2.69 0.00712 **
## cent_dist:as.factor(neighbourhood)Pangrati -1.66 0.09645 .
## cent_dist:as.factor(neighbourhood)Patisia 0.23 0.81454
## cent_dist:as.factor(neighbourhood)Pedion Areos -2.50 0.01257 *
## cent_dist:as.factor(neighbourhood)Petralona -1.74 0.08142 .
## cent_dist:as.factor(neighbourhood)Plaka -1.54 0.12345
## cent_dist:as.factor(neighbourhood)Profitis Daniil -1.37 0.16994
## cent_dist:as.factor(neighbourhood)Psyri 0.68 0.49881
## cent_dist:as.factor(neighbourhood)Rizoupoli 0.15 0.87980
## cent_dist:as.factor(neighbourhood)Rouf NA NA
## cent_dist:as.factor(neighbourhood)Sepolia -1.04 0.29613
## cent_dist:as.factor(neighbourhood)Thiseio -2.92 0.00347 **
## cent_dist:as.factor(neighbourhood)Votanikos -0.16 0.87249
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.393 on 5578 degrees of freedom
## Multiple R-squared: 0.507, Adjusted R-squared: 0.499
## F-statistic: 69.9 on 82 and 5578 DF, p-value: <2e-16
## [1] 0.507
## [1] 5575
# With the interaction between the distance and neigbourhood we achived our biggest improvement yet. Distance is important during flat hunting, but the neighbourhood also plays a huge role.
# Model 6
# In our final model we use 2 interactions. One for the distance, which we correct with neighbourhoods, and one for the reviews, where we try to weight the rating and frequency, giving a proxy for the demand of that airbnb.
model6 <- lm(log(price) ~ as.factor(accommodates) + room_type + bedrooms + bathrooms + as.factor(neighbourhood) * cent_dist + review_scores_rating * reviews_per_month,
data=na.omit(train))
summary(model6) ##
## Call:
## lm(formula = log(price) ~ as.factor(accommodates) + room_type +
## bedrooms + bathrooms + as.factor(neighbourhood) * cent_dist +
## review_scores_rating * reviews_per_month, data = na.omit(train))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6548 -0.2403 -0.0173 0.2100 1.8770
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error
## (Intercept) 1.78e+00 6.84e-01
## as.factor(accommodates)2 2.54e-01 5.18e-02
## as.factor(accommodates)3 3.10e-01 5.35e-02
## as.factor(accommodates)4 4.42e-01 5.28e-02
## as.factor(accommodates)5 4.79e-01 5.55e-02
## as.factor(accommodates)6 5.64e-01 5.55e-02
## as.factor(accommodates)7 6.10e-01 6.45e-02
## as.factor(accommodates)8 7.18e-01 6.34e-02
## as.factor(accommodates)9 7.77e-01 8.60e-02
## as.factor(accommodates)10 6.61e-01 8.50e-02
## as.factor(accommodates)11 6.29e-01 1.38e-01
## as.factor(accommodates)12 8.71e-01 9.97e-02
## as.factor(accommodates)13 2.82e-01 1.54e-01
## as.factor(accommodates)14 8.44e-01 1.41e-01
## as.factor(accommodates)15 9.39e-01 1.82e-01
## as.factor(accommodates)16 1.03e+00 1.06e-01
## room_typePrivate room 1.03e-01 4.96e-02
## room_typeHotel room -4.00e-01 2.36e-02
## room_typeShared room -1.03e+00 7.66e-02
## bedrooms 9.93e-02 1.03e-02
## bathrooms 1.83e-01 1.41e-02
## as.factor(neighbourhood)Agios Nikolaos 3.60e-01 8.21e-01
## as.factor(neighbourhood)Akadimia Platonos 3.08e+00 8.82e-01
## as.factor(neighbourhood)Ambelokipi 1.05e+00 6.85e-01
## as.factor(neighbourhood)Attiki 1.11e+00 7.43e-01
## as.factor(neighbourhood)Exarcheia 1.13e+00 6.88e-01
## as.factor(neighbourhood)Gazi 1.80e-01 1.03e+00
## as.factor(neighbourhood)Goudi -1.72e+00 2.78e+00
## as.factor(neighbourhood)Ilisia 1.58e+00 7.63e-01
## as.factor(neighbourhood)Kerameikos 1.68e+00 6.95e-01
## as.factor(neighbourhood)Kolonaki 1.49e+00 6.79e-01
## as.factor(neighbourhood)Kolonos 1.53e+00 7.80e-01
## as.factor(neighbourhood)Koukaki 1.74e+00 6.81e-01
## as.factor(neighbourhood)Kypseli 7.74e-01 7.29e-01
## as.factor(neighbourhood)Larissis 1.16e+00 6.94e-01
## as.factor(neighbourhood)Metaxourgeio 9.59e-01 6.91e-01
## as.factor(neighbourhood)Mets 1.77e+00 6.88e-01
## as.factor(neighbourhood)Monastiraki 1.77e+00 8.62e-01
## as.factor(neighbourhood)Neapoli 9.89e-01 6.99e-01
## as.factor(neighbourhood)Neos Kosmos 1.68e+00 6.79e-01
## as.factor(neighbourhood)Pangrati 1.29e+00 6.82e-01
## as.factor(neighbourhood)Patisia 7.40e-02 7.79e-01
## as.factor(neighbourhood)Pedion Areos 1.75e+00 7.31e-01
## as.factor(neighbourhood)Petralona 1.54e+00 7.24e-01
## as.factor(neighbourhood)Plaka 1.69e+00 6.78e-01
## as.factor(neighbourhood)Profitis Daniil 2.26e+00 9.66e-01
## as.factor(neighbourhood)Psyri 1.03e+00 6.96e-01
## as.factor(neighbourhood)Rizoupoli -7.35e-01 2.54e+00
## as.factor(neighbourhood)Rouf 3.62e-02 3.97e-01
## as.factor(neighbourhood)Sepolia 3.32e+00 2.54e+00
## as.factor(neighbourhood)Thiseio 2.34e+00 7.42e-01
## as.factor(neighbourhood)Votanikos 9.54e-01 1.41e+00
## cent_dist 2.02e-04 1.65e-04
## review_scores_rating 1.95e-03 8.73e-04
## reviews_per_month -1.03e+00 7.47e-02
## as.factor(neighbourhood)Agios Nikolaos:cent_dist -4.19e-05 2.23e-04
## as.factor(neighbourhood)Akadimia Platonos:cent_dist -8.14e-04 2.49e-04
## as.factor(neighbourhood)Ambelokipi:cent_dist -1.98e-04 1.68e-04
## as.factor(neighbourhood)Attiki:cent_dist -2.87e-04 2.05e-04
## as.factor(neighbourhood)Exarcheia:cent_dist -2.47e-04 1.91e-04
## as.factor(neighbourhood)Gazi:cent_dist 1.92e-04 3.38e-04
## as.factor(neighbourhood)Goudi:cent_dist 5.37e-04 7.14e-04
## as.factor(neighbourhood)Ilisia:cent_dist -3.82e-04 2.26e-04
## as.factor(neighbourhood)Kerameikos:cent_dist -3.61e-04 1.79e-04
## as.factor(neighbourhood)Kolonaki:cent_dist -2.92e-04 1.72e-04
## as.factor(neighbourhood)Kolonos:cent_dist -3.85e-04 2.17e-04
## as.factor(neighbourhood)Koukaki:cent_dist -4.87e-04 1.71e-04
## as.factor(neighbourhood)Kypseli:cent_dist -1.70e-04 1.91e-04
## as.factor(neighbourhood)Larissis:cent_dist -2.72e-04 1.85e-04
## as.factor(neighbourhood)Metaxourgeio:cent_dist -4.12e-05 1.81e-04
## as.factor(neighbourhood)Mets:cent_dist -6.34e-04 1.95e-04
## as.factor(neighbourhood)Monastiraki:cent_dist -3.01e-04 5.08e-04
## as.factor(neighbourhood)Neapoli:cent_dist -8.83e-05 2.03e-04
## as.factor(neighbourhood)Neos Kosmos:cent_dist -5.01e-04 1.69e-04
## as.factor(neighbourhood)Pangrati:cent_dist -3.25e-04 1.74e-04
## as.factor(neighbourhood)Patisia:cent_dist 2.18e-06 1.91e-04
## as.factor(neighbourhood)Pedion Areos:cent_dist -5.50e-04 2.06e-04
## as.factor(neighbourhood)Petralona:cent_dist -3.53e-04 1.87e-04
## as.factor(neighbourhood)Plaka:cent_dist -3.06e-04 1.74e-04
## as.factor(neighbourhood)Profitis Daniil:cent_dist -4.27e-04 2.86e-04
## as.factor(neighbourhood)Psyri:cent_dist 1.40e-04 2.09e-04
## as.factor(neighbourhood)Rizoupoli:cent_dist 1.10e-04 4.99e-04
## as.factor(neighbourhood)Rouf:cent_dist NA NA
## as.factor(neighbourhood)Sepolia:cent_dist -8.99e-04 7.04e-04
## as.factor(neighbourhood)Thiseio:cent_dist -6.93e-04 2.21e-04
## as.factor(neighbourhood)Votanikos:cent_dist -9.10e-05 4.52e-04
## review_scores_rating:reviews_per_month 1.01e-02 7.74e-04
## t value Pr(>|t|)
## (Intercept) 2.60 0.00943 **
## as.factor(accommodates)2 4.90 9.7e-07 ***
## as.factor(accommodates)3 5.81 6.7e-09 ***
## as.factor(accommodates)4 8.37 < 2e-16 ***
## as.factor(accommodates)5 8.63 < 2e-16 ***
## as.factor(accommodates)6 10.17 < 2e-16 ***
## as.factor(accommodates)7 9.46 < 2e-16 ***
## as.factor(accommodates)8 11.33 < 2e-16 ***
## as.factor(accommodates)9 9.03 < 2e-16 ***
## as.factor(accommodates)10 7.78 8.9e-15 ***
## as.factor(accommodates)11 4.55 5.5e-06 ***
## as.factor(accommodates)12 8.73 < 2e-16 ***
## as.factor(accommodates)13 1.84 0.06648 .
## as.factor(accommodates)14 5.97 2.5e-09 ***
## as.factor(accommodates)15 5.16 2.5e-07 ***
## as.factor(accommodates)16 9.73 < 2e-16 ***
## room_typePrivate room 2.07 0.03823 *
## room_typeHotel room -16.93 < 2e-16 ***
## room_typeShared room -13.41 < 2e-16 ***
## bedrooms 9.67 < 2e-16 ***
## bathrooms 13.02 < 2e-16 ***
## as.factor(neighbourhood)Agios Nikolaos 0.44 0.66096
## as.factor(neighbourhood)Akadimia Platonos 3.49 0.00048 ***
## as.factor(neighbourhood)Ambelokipi 1.53 0.12494
## as.factor(neighbourhood)Attiki 1.49 0.13662
## as.factor(neighbourhood)Exarcheia 1.65 0.09980 .
## as.factor(neighbourhood)Gazi 0.18 0.86060
## as.factor(neighbourhood)Goudi -0.62 0.53679
## as.factor(neighbourhood)Ilisia 2.08 0.03803 *
## as.factor(neighbourhood)Kerameikos 2.41 0.01593 *
## as.factor(neighbourhood)Kolonaki 2.20 0.02788 *
## as.factor(neighbourhood)Kolonos 1.96 0.05027 .
## as.factor(neighbourhood)Koukaki 2.56 0.01054 *
## as.factor(neighbourhood)Kypseli 1.06 0.28846
## as.factor(neighbourhood)Larissis 1.67 0.09580 .
## as.factor(neighbourhood)Metaxourgeio 1.39 0.16489
## as.factor(neighbourhood)Mets 2.57 0.01025 *
## as.factor(neighbourhood)Monastiraki 2.05 0.04015 *
## as.factor(neighbourhood)Neapoli 1.42 0.15690
## as.factor(neighbourhood)Neos Kosmos 2.48 0.01321 *
## as.factor(neighbourhood)Pangrati 1.89 0.05933 .
## as.factor(neighbourhood)Patisia 0.09 0.92433
## as.factor(neighbourhood)Pedion Areos 2.39 0.01676 *
## as.factor(neighbourhood)Petralona 2.13 0.03335 *
## as.factor(neighbourhood)Plaka 2.50 0.01254 *
## as.factor(neighbourhood)Profitis Daniil 2.34 0.01925 *
## as.factor(neighbourhood)Psyri 1.48 0.13883
## as.factor(neighbourhood)Rizoupoli -0.29 0.77191
## as.factor(neighbourhood)Rouf 0.09 0.92731
## as.factor(neighbourhood)Sepolia 1.31 0.19142
## as.factor(neighbourhood)Thiseio 3.16 0.00161 **
## as.factor(neighbourhood)Votanikos 0.67 0.49973
## cent_dist 1.22 0.22218
## review_scores_rating 2.23 0.02562 *
## reviews_per_month -13.74 < 2e-16 ***
## as.factor(neighbourhood)Agios Nikolaos:cent_dist -0.19 0.85109
## as.factor(neighbourhood)Akadimia Platonos:cent_dist -3.27 0.00107 **
## as.factor(neighbourhood)Ambelokipi:cent_dist -1.18 0.23999
## as.factor(neighbourhood)Attiki:cent_dist -1.40 0.16134
## as.factor(neighbourhood)Exarcheia:cent_dist -1.29 0.19728
## as.factor(neighbourhood)Gazi:cent_dist 0.57 0.56898
## as.factor(neighbourhood)Goudi:cent_dist 0.75 0.45163
## as.factor(neighbourhood)Ilisia:cent_dist -1.69 0.09055 .
## as.factor(neighbourhood)Kerameikos:cent_dist -2.01 0.04412 *
## as.factor(neighbourhood)Kolonaki:cent_dist -1.69 0.09034 .
## as.factor(neighbourhood)Kolonos:cent_dist -1.77 0.07616 .
## as.factor(neighbourhood)Koukaki:cent_dist -2.84 0.00451 **
## as.factor(neighbourhood)Kypseli:cent_dist -0.89 0.37354
## as.factor(neighbourhood)Larissis:cent_dist -1.47 0.14116
## as.factor(neighbourhood)Metaxourgeio:cent_dist -0.23 0.82043
## as.factor(neighbourhood)Mets:cent_dist -3.25 0.00115 **
## as.factor(neighbourhood)Monastiraki:cent_dist -0.59 0.55367
## as.factor(neighbourhood)Neapoli:cent_dist -0.43 0.66422
## as.factor(neighbourhood)Neos Kosmos:cent_dist -2.97 0.00302 **
## as.factor(neighbourhood)Pangrati:cent_dist -1.87 0.06209 .
## as.factor(neighbourhood)Patisia:cent_dist 0.01 0.99091
## as.factor(neighbourhood)Pedion Areos:cent_dist -2.67 0.00770 **
## as.factor(neighbourhood)Petralona:cent_dist -1.88 0.05955 .
## as.factor(neighbourhood)Plaka:cent_dist -1.76 0.07835 .
## as.factor(neighbourhood)Profitis Daniil:cent_dist -1.49 0.13623
## as.factor(neighbourhood)Psyri:cent_dist 0.67 0.50333
## as.factor(neighbourhood)Rizoupoli:cent_dist 0.22 0.82615
## as.factor(neighbourhood)Rouf:cent_dist NA NA
## as.factor(neighbourhood)Sepolia:cent_dist -1.28 0.20149
## as.factor(neighbourhood)Thiseio:cent_dist -3.14 0.00172 **
## as.factor(neighbourhood)Votanikos:cent_dist -0.20 0.84064
## review_scores_rating:reviews_per_month 13.07 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.376 on 5575 degrees of freedom
## Multiple R-squared: 0.547, Adjusted R-squared: 0.541
## F-statistic: 79.3 on 85 and 5575 DF, p-value: <2e-16
## [1] 0.547
## [1] 5092
# With these adjustments our model our model outperforms any other model we tried, and is still fairly understandable. Our R2 is around 55%, and our Akaike infromation criterion is 5092, which is much lower than our first tries which were around 12000.
ggplot(model4, aes(x = .fitted, y = .resid)) +
geom_point() +
labs(title = "Residuals vary around zero") +
ylab("Residual") +
xlab("")# Choose models with the lowest MSPE (mean squared prediction error)
# Model 1
mean((log(test$price) - predict.lm(model1, test)) ^ 2, na.rm=T)## [1] 0.232
## [1] 0.207
## [1] 0.199
## [1] 0.188
## [1] 0.172
## [1] 0.159
We used stepwise method to look for the lowest possible AIC model, but it contained variables which would be hard to defend logically
full.model <- lm(log(price) ~., data = na.omit(train))
step.model <- stepAIC(full.model, direction = “both”, trace = FALSE)
step.model %>% summary() %>% select(coefficients)
as.data.frame(summary(step.model)$coefficients) %>% arrange(Estimate)
First let’s visualize is our errors could be explained by their location
pred_price <- exp(predict(model6, test, se.fit = TRUE)$fit)
pred_test <- test %>%
cbind(pred_price) %>%
mutate(
pred_error = (test$price - pred_price) / pred_price
)
athens_map +
geom_point(data=pred_test, aes(x = longitude, y = latitude, color = pred_error*100)) +
geom_point(aes(x = syntagma['latitude'], syntagma['longitude']), color = 'red', size = 5) +
map_theme +
labs(title = "Our residuals do not correlate with distance" , subtitle = "Colors represent error level") +
scale_color_continuous(name="Error level (%)")Finally let’s see how much would be a night if me and my fried would like to visit Athene and would like to leave in 1.5km radius of the centre.
pred_friend <- test %>%
filter(
cent_dist <= 1500,
accommodates == 2
)
ggplot(data=pred_friend, aes(x=price, y=exp(predict(model6, pred_friend, se.fit = TRUE)$fit))) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
labs(title = "Our model predicts well most of the prices, except some extreme prices",
subtitle = "Predicted vs actual prices (1.5km from centre, 2 accomodation)") +
ylab("Predicted price") +
xlab("Actual price")Would you like to travel with more friends? Or would you like to move further our?
Use this tool to find out more about our predictions and actuals
library(shiny)
ui <- fluidPage(
titlePanel(title=h4("Predicted Athene airbnb prices", align="center")),
sidebarPanel(
numericInput("cent_dist", label="How far from the centre (max value)?", value=1500),
numericInput("accom", label="How many people?", value=2),
selectInput("bathrooms", label="How many bathrooms?",
choices = c("Any", unique(test$bathrooms)))),
mainPanel(plotOutput("plot2")),
tableOutput("table"))
server <- function(input,output){
dat <- reactive({
data <- pred_friend <- test %>%
filter(
cent_dist <= input$cent_dist,
ifelse(input$bathrooms != "Any",
bathrooms == input$bathrooms,
bathrooms == bathrooms),
accommodates == input$accom
)
return(data)
})
output$table <- renderTable({
table <- predict(model6, newdata = dat(),
interval = "confidence") %>%
exp() %>%
data.frame() %>%
summarize(lower_bound = mean(lwr),
predicted_price = mean(fit),
upper_bound = mean(upr))
names(table) <- c("Lower CI Prediction", "Mean Prediction", "Upper CI Prediction")
return(table)
})
output$plot2<-renderPlot({
reac_data <- dat()
acc_str <- paste0("Accomodates:", input$accom, sep=" ")
cent_str <- paste0("Distance from centre:", input$cent_dist, sep=" ")
bathrooms_str <- paste0("Bathrooms:", input$bathrooms, sep=" ")
ggplot(data=reac_data, aes(x=price,
y=exp(predict(model6, reac_data, se.fit = TRUE)$fit))) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
labs(title = "Prediction vs. Actual prices in Athene",
subtitle = paste0(acc_str, cent_str, bathrooms_str, sep=" ")) +
ylab("Predicted price") +
xlab("Actual price")
})
}
shinyApp(ui, server)